{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "# Empty CUDA_VISIBLE_DEVICES, set before TensorFlow is imported in the next cell;\n",
    "# presumably to keep this preprocessing notebook on CPU.\n",
    "os.environ['CUDA_VISIBLE_DEVICES'] = ''"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import tensorflow as tf\n",
    "import tensorflow_datasets as tfds\n",
    "from t5.data import preprocessors as prep\n",
    "import functools\n",
    "import t5\n",
    "import gin\n",
    "import sentencepiece as spm\n",
    "from glob import glob\n",
    "import os\n",
    "\n",
    "# Parse the gin operative config file from the working directory.\n",
    "gin.parse_config_file('pretrained_models_base_operative_config.gin')\n",
    "# SentencePiece model path; the same value is passed as sentencepiece_model_path\n",
    "# when the T5 task is registered further down.\n",
    "vocab = 'sp10m.cased.ms-en.model'\n",
    "sp = spm.SentencePieceProcessor()\n",
    "# The cell output shows the value returned by Load() (True in the saved run).\n",
    "sp.Load(vocab)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# !wget https://f000.backblazeb2.com/file/malay-dataset/corpus/goemotions/goemotions_1.translated.csv\n",
    "# !wget https://f000.backblazeb2.com/file/malay-dataset/corpus/goemotions/goemotions_2.translated.csv\n",
    "# !wget https://f000.backblazeb2.com/file/malay-dataset/corpus/goemotions/goemotions_3.translated.csv"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['goemotions_3.translated.csv',\n",
       " 'goemotions_2.translated.csv',\n",
       " 'goemotions_1.translated.csv']"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# List the downloaded shards (same pattern as the processing loop below).\n",
    "glob('goemotions_*.translated.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>text</th>\n",
       "      <th>id</th>\n",
       "      <th>author</th>\n",
       "      <th>subreddit</th>\n",
       "      <th>link_id</th>\n",
       "      <th>parent_id</th>\n",
       "      <th>created_utc</th>\n",
       "      <th>rater_id</th>\n",
       "      <th>example_very_unclear</th>\n",
       "      <th>admiration</th>\n",
       "      <th>...</th>\n",
       "      <th>optimism</th>\n",
       "      <th>pride</th>\n",
       "      <th>realization</th>\n",
       "      <th>relief</th>\n",
       "      <th>remorse</th>\n",
       "      <th>sadness</th>\n",
       "      <th>surprise</th>\n",
       "      <th>neutral</th>\n",
       "      <th>translate_nmt</th>\n",
       "      <th>translate_replace</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>That game hurt.</td>\n",
       "      <td>eew5j0j</td>\n",
       "      <td>Brdd9</td>\n",
       "      <td>nrl</td>\n",
       "      <td>t3_ajis4z</td>\n",
       "      <td>t1_eew18eq</td>\n",
       "      <td>1.548381e+09</td>\n",
       "      <td>1</td>\n",
       "      <td>False</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>Permainan itu sakit.</td>\n",
       "      <td>That permainan sakit hati .</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>&gt;sexuality shouldn’t be a grouping category I...</td>\n",
       "      <td>eemcysk</td>\n",
       "      <td>TheGreen888</td>\n",
       "      <td>unpopularopinion</td>\n",
       "      <td>t3_ai4q37</td>\n",
       "      <td>t3_ai4q37</td>\n",
       "      <td>1.548084e+09</td>\n",
       "      <td>37</td>\n",
       "      <td>True</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>&gt; seksualiti tidak boleh menjadi kategori peng...</td>\n",
       "      <td>&gt; seksualiti sepatutnya tidak jadi a kumpulan ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>You do right, if you don't care then fuck 'em!</td>\n",
       "      <td>ed2mah1</td>\n",
       "      <td>Labalool</td>\n",
       "      <td>confessions</td>\n",
       "      <td>t3_abru74</td>\n",
       "      <td>t1_ed2m7g7</td>\n",
       "      <td>1.546428e+09</td>\n",
       "      <td>37</td>\n",
       "      <td>False</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>Anda melakukan yang betul, jika anda tidak ped...</td>\n",
       "      <td>You do right , jika awak do tidak jaga kemudia...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Man I love reddit.</td>\n",
       "      <td>eeibobj</td>\n",
       "      <td>MrsRobertshaw</td>\n",
       "      <td>facepalm</td>\n",
       "      <td>t3_ahulml</td>\n",
       "      <td>t3_ahulml</td>\n",
       "      <td>1.547965e+09</td>\n",
       "      <td>18</td>\n",
       "      <td>False</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>Lelaki yang saya suka reddit.</td>\n",
       "      <td>Man I love reddit .</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>[NAME] was nowhere near them, he was by the Fa...</td>\n",
       "      <td>eda6yn6</td>\n",
       "      <td>American_Fascist713</td>\n",
       "      <td>starwarsspeculation</td>\n",
       "      <td>t3_ackt2f</td>\n",
       "      <td>t1_eda65q2</td>\n",
       "      <td>1.546669e+09</td>\n",
       "      <td>2</td>\n",
       "      <td>False</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>[NAME] tidak ada di mana-mana berhampiran mere...</td>\n",
       "      <td>[ NAME ] adalah tiada tempat berhampiran them ...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 39 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                text       id  \\\n",
       "0                                    That game hurt.  eew5j0j   \n",
       "1   >sexuality shouldn’t be a grouping category I...  eemcysk   \n",
       "2     You do right, if you don't care then fuck 'em!  ed2mah1   \n",
       "3                                 Man I love reddit.  eeibobj   \n",
       "4  [NAME] was nowhere near them, he was by the Fa...  eda6yn6   \n",
       "\n",
       "                author            subreddit    link_id   parent_id  \\\n",
       "0                Brdd9                  nrl  t3_ajis4z  t1_eew18eq   \n",
       "1          TheGreen888     unpopularopinion  t3_ai4q37   t3_ai4q37   \n",
       "2             Labalool          confessions  t3_abru74  t1_ed2m7g7   \n",
       "3        MrsRobertshaw             facepalm  t3_ahulml   t3_ahulml   \n",
       "4  American_Fascist713  starwarsspeculation  t3_ackt2f  t1_eda65q2   \n",
       "\n",
       "    created_utc  rater_id  example_very_unclear  admiration  ...  optimism  \\\n",
       "0  1.548381e+09         1                 False           0  ...         0   \n",
       "1  1.548084e+09        37                  True           0  ...         0   \n",
       "2  1.546428e+09        37                 False           0  ...         0   \n",
       "3  1.547965e+09        18                 False           0  ...         0   \n",
       "4  1.546669e+09         2                 False           0  ...         0   \n",
       "\n",
       "   pride  realization  relief  remorse  sadness  surprise  neutral  \\\n",
       "0      0            0       0        0        1         0        0   \n",
       "1      0            0       0        0        0         0        0   \n",
       "2      0            0       0        0        0         0        1   \n",
       "3      0            0       0        0        0         0        0   \n",
       "4      0            0       0        0        0         0        1   \n",
       "\n",
       "                                       translate_nmt  \\\n",
       "0                               Permainan itu sakit.   \n",
       "1  > seksualiti tidak boleh menjadi kategori peng...   \n",
       "2  Anda melakukan yang betul, jika anda tidak ped...   \n",
       "3                      Lelaki yang saya suka reddit.   \n",
       "4  [NAME] tidak ada di mana-mana berhampiran mere...   \n",
       "\n",
       "                                   translate_replace  \n",
       "0                        That permainan sakit hati .  \n",
       "1  > seksualiti sepatutnya tidak jadi a kumpulan ...  \n",
       "2  You do right , jika awak do tidak jaga kemudia...  \n",
       "3                                Man I love reddit .  \n",
       "4  [ NAME ] adalah tiada tempat berhampiran them ...  \n",
       "\n",
       "[5 rows x 39 columns]"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import pandas as pd\n",
    "\n",
    "# Load a single shard and preview it to see the column layout.\n",
    "df = pd.read_csv('goemotions_1.translated.csv')\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "goemotions_3.translated.csv\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 71225/71225 [00:19<00:00, 3600.23it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "goemotions_2.translated.csv\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 70000/70000 [00:19<00:00, 3607.38it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "goemotions_1.translated.csv\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 70000/70000 [00:18<00:00, 3703.47it/s]\n"
     ]
    }
   ],
   "source": [
    "from tqdm import tqdm\n",
    "\n",
    "# (source, target) tuples accumulated over every CSV shard.\n",
    "en_ms_translated = []\n",
    "en_ms_replaced = []\n",
    "ms_en_translated = []\n",
    "ms_en_replaced = []\n",
    "\n",
    "# sorted() fixes the shard order, so the row order of the TSV files written\n",
    "# from these lists is reproducible; glob() alone gives no ordering guarantee.\n",
    "for f in sorted(glob('goemotions_*.translated.csv')):\n",
    "    print(f)\n",
    "    df = pd.read_csv(f)\n",
    "\n",
    "    # Extract the three columns once instead of one df.iloc[i, j] lookup per cell.\n",
    "    # Same positions as before: first column, second-to-last, last. In\n",
    "    # goemotions_1 these are text, translate_nmt and translate_replace.\n",
    "    english = df.iloc[:, 0].tolist()\n",
    "    nmt = df.iloc[:, -2].tolist()\n",
    "    replaced = df.iloc[:, -1].tolist()\n",
    "\n",
    "    for en, ms_nmt, ms_rep in tqdm(zip(english, nmt, replaced), total = len(df)):\n",
    "        en_ms_translated.append((en, ms_nmt))\n",
    "        en_ms_replaced.append((en, ms_rep))\n",
    "        ms_en_translated.append((ms_nmt, en))\n",
    "        ms_en_replaced.append((ms_rep, en))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "import re\n",
    "\n",
    "def cleaning(string):\n",
    "    \"\"\"Flatten a text onto a single line.\n",
    "\n",
    "    Newlines and tabs become spaces, runs of spaces collapse to one space,\n",
    "    and leading/trailing whitespace is stripped. Input without a str-style\n",
    "    .replace (e.g. a float) raises AttributeError.\n",
    "    \"\"\"\n",
    "    flattened = string.replace('\\n', ' ').replace('\\t', ' ')\n",
    "    squeezed = re.sub(r'[ ]+', ' ', flattened)\n",
    "    return squeezed.strip()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 211225/211225 [00:04<00:00, 48736.72it/s]\n"
     ]
    }
   ],
   "source": [
    "# Write one 'source<TAB>target' line per pair. Pairs whose fields cannot be\n",
    "# cleaned (non-string cells, presumably NaN from empty CSV fields) are skipped\n",
    "# and counted; write errors are no longer swallowed.\n",
    "skipped = 0\n",
    "with tf.io.gfile.GFile('en_ms_translated.tsv', \"w\") as outfile:\n",
    "    for left, right in tqdm(en_ms_translated):\n",
    "        try:\n",
    "            src = cleaning(left)\n",
    "            tgt = cleaning(right)\n",
    "        except (AttributeError, TypeError):\n",
    "            skipped += 1\n",
    "            continue\n",
    "        outfile.write(\"%s\\t%s\\n\" % (src, tgt))\n",
    "print('skipped %d of %d pairs' % (skipped, len(en_ms_translated)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 211225/211225 [00:04<00:00, 44135.35it/s]\n"
     ]
    }
   ],
   "source": [
    "# Write one 'source<TAB>target' line per pair. Pairs whose fields cannot be\n",
    "# cleaned (non-string cells, presumably NaN from empty CSV fields) are skipped\n",
    "# and counted; write errors are no longer swallowed.\n",
    "skipped = 0\n",
    "with tf.io.gfile.GFile('en_ms_replaced.tsv', \"w\") as outfile:\n",
    "    for left, right in tqdm(en_ms_replaced):\n",
    "        try:\n",
    "            src = cleaning(left)\n",
    "            tgt = cleaning(right)\n",
    "        except (AttributeError, TypeError):\n",
    "            skipped += 1\n",
    "            continue\n",
    "        outfile.write(\"%s\\t%s\\n\" % (src, tgt))\n",
    "print('skipped %d of %d pairs' % (skipped, len(en_ms_replaced)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 211225/211225 [00:04<00:00, 50799.15it/s]\n"
     ]
    }
   ],
   "source": [
    "# Write one 'source<TAB>target' line per pair. Pairs whose fields cannot be\n",
    "# cleaned (non-string cells, presumably NaN from empty CSV fields) are skipped\n",
    "# and counted; write errors are no longer swallowed.\n",
    "skipped = 0\n",
    "with tf.io.gfile.GFile('ms_en_translated.tsv', \"w\") as outfile:\n",
    "    for left, right in tqdm(ms_en_translated):\n",
    "        try:\n",
    "            src = cleaning(left)\n",
    "            tgt = cleaning(right)\n",
    "        except (AttributeError, TypeError):\n",
    "            skipped += 1\n",
    "            continue\n",
    "        outfile.write(\"%s\\t%s\\n\" % (src, tgt))\n",
    "print('skipped %d of %d pairs' % (skipped, len(ms_en_translated)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 211225/211225 [00:04<00:00, 46304.85it/s]\n"
     ]
    }
   ],
   "source": [
    "# Write one 'source<TAB>target' line per pair. Pairs whose fields cannot be\n",
    "# cleaned (non-string cells, presumably NaN from empty CSV fields) are skipped\n",
    "# and counted; write errors are no longer swallowed.\n",
    "skipped = 0\n",
    "with tf.io.gfile.GFile('ms_en_replaced.tsv', \"w\") as outfile:\n",
    "    for left, right in tqdm(ms_en_replaced):\n",
    "        try:\n",
    "            src = cleaning(left)\n",
    "            tgt = cleaning(right)\n",
    "        except (AttributeError, TypeError):\n",
    "            skipped += 1\n",
    "            continue\n",
    "        outfile.write(\"%s\\t%s\\n\" % (src, tgt))\n",
    "print('skipped %d of %d pairs' % (skipped, len(ms_en_replaced)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [],
   "source": [
    "def social_en_ms_translated_dataset(split, shuffle_files = False):\n",
    "    \"\"\"Read en_ms_translated.tsv as a dataset of question/answer string pairs.\n",
    "\n",
    "    The file now matches the task name and the 'sosial en_ms_translated: '\n",
    "    prefix added by the preprocessor; the previous version read\n",
    "    ms_en_replaced.tsv under this en_ms_translated name.\n",
    "\n",
    "    Args:\n",
    "        split: accepted for the T5 dataset_fn signature; not used.\n",
    "        shuffle_files: accepted for the T5 dataset_fn signature; not used.\n",
    "\n",
    "    Returns:\n",
    "        A tf.data.Dataset of dicts with string entries 'question' (first TSV\n",
    "        column) and 'answer' (second TSV column).\n",
    "    \"\"\"\n",
    "    del split, shuffle_files\n",
    "    ds = tf.data.TextLineDataset(['en_ms_translated.tsv'])\n",
    "\n",
    "    # Split each line on tabs into exactly two string fields; quote characters\n",
    "    # are treated as ordinary text (use_quote_delim = False).\n",
    "    parse_line = functools.partial(\n",
    "        tf.io.decode_csv,\n",
    "        record_defaults = ['', ''],\n",
    "        field_delim = '\\t',\n",
    "        use_quote_delim = False,\n",
    "    )\n",
    "    ds = ds.map(parse_line, num_parallel_calls = tf.data.experimental.AUTOTUNE)\n",
    "    ds = ds.map(lambda *ex: dict(zip(['question', 'answer'], ex)))\n",
    "    return ds\n",
    "\n",
    "def social_en_ms_translated_preprocessor(ds):\n",
    "    \"\"\"Turn question/answer examples into T5 'inputs'/'targets' examples.\n",
    "\n",
    "    'inputs' is the question with the task prefix prepended; 'targets' is the\n",
    "    answer unchanged.\n",
    "    \"\"\"\n",
    "    def to_inputs_and_targets(ex):\n",
    "        prefixed = tf.strings.join(['sosial en_ms_translated: ', ex['question']])\n",
    "        return {'inputs': prefixed, 'targets': ex['answer']}\n",
    "\n",
    "    autotune = tf.data.experimental.AUTOTUNE\n",
    "    return ds.map(to_inputs_and_targets, num_parallel_calls = autotune)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Remove any earlier registration under this name before adding it again;\n",
    "# presumably so the cell can be re-run in the same kernel.\n",
    "t5.data.TaskRegistry.remove('social_en_ms_translated_dataset')\n",
    "# Register the task: raw pairs come from the dataset function, the text\n",
    "# preprocessor adds the prefix, and tokenization uses the SentencePiece model in `vocab`.\n",
    "t5.data.TaskRegistry.add(\n",
    "    'social_en_ms_translated_dataset',\n",
    "    dataset_fn = social_en_ms_translated_dataset,\n",
    "    splits = ['train'],\n",
    "    text_preprocessor = [social_en_ms_translated_preprocessor],\n",
    "    sentencepiece_model_path = vocab,\n",
    "    metric_fns = [t5.evaluation.metrics.accuracy],\n",
    ")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [],
   "source": [
    "nq_task = t5.data.TaskRegistry.get(\"social_en_ms_translated_dataset\")\n",
    "# 'train' is the only split the task is registered with; the dataset function\n",
    "# ignores the split value, so the data returned is the same as before.\n",
    "ds = nq_task.get_dataset(split='train', sequence_length={\"inputs\": 1024, \"targets\": 1024})\n",
    "# Numpy view of the dataset; the next cell pulls the first example with next(r).\n",
    "r = tfds.as_numpy(ds)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'inputs_plaintext': b'sosial en_ms_translated: I do tidak ingat [ NAME ] saying itu tbh',\n",
       " 'inputs': array([  969,    13,   258,  2902,   117,    16,  2902, 12262,  5635,\n",
       "           72,    31,    59,   184,    30,  4148,   356,   517, 20414,\n",
       "           13,   332,  1238,    37,    13,    41,   380,   261,     1]),\n",
       " 'targets_plaintext': b'I don\\xe2\\x80\\x99t remember [NAME] saying that tbh',\n",
       " 'targets': array([   59,   367,     2,    41,  3548,   356,   584, 20414,   332,\n",
       "         1238,    27,    13,    41,   380,   261,     1])}"
      ]
     },
     "execution_count": 36,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Show the first tokenized example produced by the task.\n",
    "next(r)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
